#Course: BUAN 5210
#Purpose: Analyze effectiveness of Advertisement and Promotion, and comparison with competitors
#Date: 11/02/2019
#Author: Akira Nakagawa, Anh Doan
Preparing libraries and data for analysis
# -- Clear environment of variables and functions------------------
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
# -- Load packages ---------------------------------------
#install.packages("tidyverse")
library(tidyverse)
# The gridExtra package contains grid.arrange function used to combine plots in the same window
library(gridExtra)
# The janitor package contains tidyverse functions for cross-tables
library(janitor)
# The knitr package contains some table formating functions
library(knitr)
# The GGally package contains a custom correlation plot we will use
library(GGally)
library(MultinomialCI)
library(htmlTable)
library(Hmisc)
library(formattable)
library(rms)
library(MultNonParam)
# -- Load data from mtp_data.csv -----------------------------------
mtp <- read.csv("mtp_data.csv")
# -- data cleansing -----------------------------------
# Convert into factor variables
mtp$promo <- factor(mtp$promo)
mtp$iri_key <- factor(mtp$iri_key)
# Create a column for Company based on the column "brand"
Kell <- c("KELLOGGS COCOA KRISPIES", "KELLOGGS FROOT LOOPS", "KELLOGGS FROSTED MINI WHEATS", "KELLOGGS RAISIN BRAN", "KELLOGGS RICE KRISPIES", "KELLOGGS SMART START", "KELLOGGS SPECIAL K")
GM <- c("GENERAL MILLS CHEERIOS", "GENERAL MILLS CINNAMON TST CR", "GENERAL MILLS COCOA PUFFS", "GENERAL MILLS KIX", "GENERAL MILLS LUCKY CHARMS")
Po <- c("POST GRAPE NUTS", "POST SHREDDED WHEAT")
mtp <- transform(mtp, company= if_else( mtp$brand %in% Kell, "Kelloggs", if_else(mtp$brand %in% GM, "GM", "Post")))
# to look at the data
head(mtp)
## UPC iri_key week units brand
## 1 00-01-16000-11653 644347 1484 5 GENERAL MILLS CINNAMON TST CR
## 2 00-01-16000-11653 248741 1483 2 GENERAL MILLS CINNAMON TST CR
## 3 00-01-16000-11653 535806 1489 3 GENERAL MILLS CINNAMON TST CR
## 4 00-01-16000-11945 675634 1489 2 GENERAL MILLS CHEERIOS
## 5 00-01-16000-11945 205272 1491 8 GENERAL MILLS CHEERIOS
## 6 00-01-16000-11945 248741 1492 5 GENERAL MILLS CHEERIOS
## flavor package volume price promo ad company
## 1 CINNAMON TOAST BOX 0.06 0.5 0 A GM
## 2 CINNAMON TOAST BOX 0.06 0.5 0 NONE GM
## 3 CINNAMON TOAST BOX 0.06 0.5 0 NONE GM
## 4 TOASTED BOX 0.04 0.5 0 NONE GM
## 5 TOASTED BOX 0.04 0.5 0 NONE GM
## 6 TOASTED BOX 0.04 0.5 0 NONE GM
# to see how many observations, variables, types etc
str(mtp)
## 'data.frame': 21850 obs. of 12 variables:
## $ UPC : Factor w/ 114 levels "00-01-16000-11653",..: 1 1 1 2 2 2 3 3 3 3 ...
## $ iri_key: Factor w/ 1420 levels "200171","200197",..: 1041 446 1018 1217 48 446 1295 794 1184 1043 ...
## $ week : int 1484 1483 1489 1489 1491 1492 1517 1513 1523 1483 ...
## $ units : int 5 2 3 2 8 5 6 1 4 14 ...
## $ brand : Factor w/ 15 levels "GENERAL MILLS CHEERIOS",..: 2 2 2 1 1 1 2 2 2 2 ...
## $ flavor : Factor w/ 5 levels "CINNAMON TOAST",..: 1 1 1 5 5 5 1 1 1 1 ...
## $ package: Factor w/ 2 levels "BOX","CUP": 1 1 1 1 1 1 2 2 2 2 ...
## $ volume : num 0.06 0.06 0.06 0.04 0.04 0.04 0.12 0.12 0.12 0.12 ...
## $ price : num 0.5 0.5 0.5 0.5 0.5 0.5 1.09 1.59 1.59 1 ...
## $ promo : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ad : Factor w/ 3 levels "A","B","NONE": 1 3 3 3 3 3 3 3 3 3 ...
## $ company: Factor w/ 3 levels "GM","Kelloggs",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(mtp)
## UPC iri_key week units
## 00-01-43000-10521: 676 656444 : 35 Min. :1479 Min. : 1.000
## 00-01-38000-01621: 666 256951 : 31 1st Qu.:1492 1st Qu.: 3.000
## 00-01-38000-00828: 660 259661 : 31 Median :1505 Median : 7.000
## 00-01-16000-27569: 639 267403 : 31 Mean :1505 Mean : 8.579
## 00-02-38000-66330: 618 652139 : 31 3rd Qu.:1518 3rd Qu.:12.000
## 00-01-38000-01611: 612 681735 : 31 Max. :1530 Max. :28.000
## (Other) :17979 (Other):21660
## brand flavor package
## KELLOGGS FROSTED FLAKES : 2295 CINNAMON TOAST:1834 BOX:21306
## KELLOGGS FROOT LOOPS : 2192 COCOA :1901 CUP: 544
## GENERAL MILLS CINNAMON TST CR: 1834 FRUIT :2192
## GENERAL MILLS LUCKY CHARMS : 1681 REGULAR :8816
## KELLOGGS FROSTED MINI WHEATS : 1574 TOASTED :7107
## GENERAL MILLS CHEERIOS : 1458
## (Other) :10816
## volume price promo ad company
## Min. :0.040 Min. :0.250 0:17305 A : 1456 GM :7189
## 1st Qu.:0.750 1st Qu.:3.190 1: 4545 B : 1061 Kelloggs:9888
## Median :1.060 Median :3.790 NONE:19333 Post :4773
## Mean :1.016 Mean :3.763
## 3rd Qu.:1.120 3rd Qu.:4.390
## Max. :4.000 Max. :9.990
##
grid.arrange(
# company
ggplot(data = mtp, mapping = aes(x = company)) +
geom_bar(),
# brand
ggplot(data = mtp, mapping = aes(x = brand)) +
geom_bar(),
# flavor
ggplot(data = mtp, mapping = aes(x = flavor)) +
geom_bar(),
# package
ggplot(data = mtp, mapping = aes(x = package)) +
geom_bar(),
# promo
ggplot(data = mtp, mapping = aes(x = promo)) +
geom_bar(),
# ad
ggplot(data = mtp, mapping = aes(x = ad)) +
geom_bar(),
ncol = 1 )
grid.arrange(
# Create histogram
ggplot(data = mtp, mapping = aes(x = units)) +
geom_histogram(),
# Add boxplot
ggplot(data = mtp, mapping = aes(x = 1)) +
geom_boxplot(mapping = aes(y = units)) +
coord_flip(), # use to have same x-axis on both graphs
# Set number of columns in grid.arrange
ncol = 1 )
grid.arrange(
# Create histogram
ggplot(data = mtp, mapping = aes(x = volume)) +
geom_histogram(),
# Add boxplot
ggplot(data = mtp, mapping = aes(x = 1)) +
geom_boxplot(mapping = aes(y = volume)) +
coord_flip(), # use to have same x-axis on both graphs
# Set number of columns in grid.arrange
ncol = 1 )
grid.arrange(
# Create histogram
ggplot(data = mtp, mapping = aes(x = price)) +
geom_histogram(),
# Add boxplot
ggplot(data = mtp, mapping = aes(x = 1)) +
geom_boxplot(mapping = aes(y = price)) +
coord_flip(), # use to have same x-axis on both graphs
# Set number of columns in grid.arrange
ncol = 1 )
Company and Flavor
# Company and Flavor
mtp %>%
tabyl(flavor, company) %>% # creates table of counts
adorn_totals(where = c("row", "col")) # Total margins
## flavor GM Kelloggs Post Total
## CINNAMON TOAST 1834 0 0 1834
## COCOA 1020 881 0 1901
## FRUIT 0 2192 0 2192
## REGULAR 1203 2840 4773 8816
## TOASTED 3132 3975 0 7107
## Total 7189 9888 4773 21850
mtp %>%
tabyl(flavor, company) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>% # creates proportions
adorn_rounding(2) # round decimals
## flavor GM Kelloggs Post Total
## CINNAMON TOAST 0.08 0.00 0.00 0.08
## COCOA 0.05 0.04 0.00 0.09
## FRUIT 0.00 0.10 0.00 0.10
## REGULAR 0.06 0.13 0.22 0.40
## TOASTED 0.14 0.18 0.00 0.33
## Total 0.33 0.45 0.22 1.00
Package and brand(comapny)
# package and brand
mtp %>%
tabyl(brand, package) %>% # creates table of counts
adorn_totals(where = c("row", "col")) # Total margins
## brand BOX CUP Total
## GENERAL MILLS CHEERIOS 1411 47 1458
## GENERAL MILLS CINNAMON TST CR 1774 60 1834
## GENERAL MILLS COCOA PUFFS 1020 0 1020
## GENERAL MILLS KIX 1196 0 1196
## GENERAL MILLS LUCKY CHARMS 1624 57 1681
## KELLOGGS COCOA KRISPIES 877 4 881
## KELLOGGS FROOT LOOPS 2099 93 2192
## KELLOGGS FROSTED FLAKES 2158 137 2295
## KELLOGGS FROSTED MINI WHEATS 1539 35 1574
## KELLOGGS RAISIN BRAN 1266 0 1266
## KELLOGGS RICE KRISPIES 1425 25 1450
## KELLOGGS SMART START 1134 0 1134
## KELLOGGS SPECIAL K 1305 86 1391
## POST GRAPE NUTS 1289 0 1289
## POST SHREDDED WHEAT 1189 0 1189
## Total 21306 544 21850
mtp %>%
tabyl(brand, package) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>% # creates proportions
adorn_rounding(2) # round decimals
## brand BOX CUP Total
## GENERAL MILLS CHEERIOS 0.06 0.00 0.07
## GENERAL MILLS CINNAMON TST CR 0.08 0.00 0.08
## GENERAL MILLS COCOA PUFFS 0.05 0.00 0.05
## GENERAL MILLS KIX 0.05 0.00 0.05
## GENERAL MILLS LUCKY CHARMS 0.07 0.00 0.08
## KELLOGGS COCOA KRISPIES 0.04 0.00 0.04
## KELLOGGS FROOT LOOPS 0.10 0.00 0.10
## KELLOGGS FROSTED FLAKES 0.10 0.01 0.11
## KELLOGGS FROSTED MINI WHEATS 0.07 0.00 0.07
## KELLOGGS RAISIN BRAN 0.06 0.00 0.06
## KELLOGGS RICE KRISPIES 0.07 0.00 0.07
## KELLOGGS SMART START 0.05 0.00 0.05
## KELLOGGS SPECIAL K 0.06 0.00 0.06
## POST GRAPE NUTS 0.06 0.00 0.06
## POST SHREDDED WHEAT 0.05 0.00 0.05
## Total 0.98 0.02 1.00
Package and company
# package and company
mtp %>%
tabyl(company, package) %>% # creates table of counts
adorn_totals(where = c("row", "col")) # Total margins
## company BOX CUP Total
## GM 7025 164 7189
## Kelloggs 9645 243 9888
## Post 4636 137 4773
## Total 21306 544 21850
mtp %>%
tabyl(company, package) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>% # creates proportions
adorn_rounding(2) # round decimals
## company BOX CUP Total
## GM 0.32 0.01 0.33
## Kelloggs 0.44 0.01 0.45
## Post 0.21 0.01 0.22
## Total 0.98 0.02 1.00
Company and Ad
# ad and company
mtp %>%
tabyl(company, ad) %>% # creates table of counts
adorn_totals(where = c("row", "col")) # Total margins
## company A B NONE Total
## GM 442 272 6475 7189
## Kelloggs 762 540 8586 9888
## Post 252 249 4272 4773
## Total 1456 1061 19333 21850
mtp %>%
tabyl(company, ad) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>% # creates proportions
adorn_rounding(2) # round decimals
## company A B NONE Total
## GM 0.02 0.01 0.30 0.33
## Kelloggs 0.03 0.02 0.39 0.45
## Post 0.01 0.01 0.20 0.22
## Total 0.07 0.05 0.88 1.00
Company and Promo
# promo and company
mtp %>%
tabyl(company, promo) %>% # creates table of counts
adorn_totals(where = c("row", "col")) # Total margins
## company 0 1 Total
## GM 5909 1280 7189
## Kelloggs 7651 2237 9888
## Post 3745 1028 4773
## Total 17305 4545 21850
mtp %>%
tabyl(company, promo) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>% # creates proportions
adorn_rounding(2) # round decimals
## company 0 1 Total
## GM 0.27 0.06 0.33
## Kelloggs 0.35 0.10 0.45
## Post 0.17 0.05 0.22
## Total 0.79 0.21 1.00
# Correlation table
mtp %>%
select_if(is.numeric) %>% # Use to select just the numeric variables
cor() %>%
round(2) %>%
kable()
week units volume price
# More detail on promo,ad and company
grid.arrange(
# ad and company
mtp %>%
ggplot(mapping = aes(x = ad, fill = company)) +
geom_bar(position = "dodge"),
# promo and company
mtp %>%
ggplot(mapping = aes(x = promo , fill = company)) +
geom_bar(position = "dodge"),
# flavor and company
mtp %>%
ggplot(mapping = aes(x = flavor , fill = company)) +
geom_bar(position = "dodge"),
# package and company
mtp %>%
ggplot(mapping = aes(x = package , fill = company)) +
geom_bar(position = "dodge"),
ncol = 1
)
# More detail on company and ad
grid.arrange(
mtp %>%
ggplot(mapping = aes(x = company, fill = ad)) +
geom_bar(position = "dodge") +
coord_flip(),
mtp %>%
ggplot(mapping = aes(x = company, fill = ad)) +
geom_bar(position = "fill") +
coord_flip(),
ncol = 1
)
# More detail on brand and promo
grid.arrange(
# Cluster of counts
mtp %>%
ggplot(mapping = aes(x = company, fill = promo)) +
geom_bar(position = "dodge") +
coord_flip(),
# Proportion of counts
mtp %>%
ggplot(mapping = aes(x = company, fill = promo)) +
geom_bar(position = "fill") +
coord_flip(),
ncol = 1
)
mtp %>%
group_by(ad, company) %>%
summarise(count = n()) %>%
ggplot(aes(ad, company)) +
geom_tile(aes(fill = count))
# price and units
mtp %>%
ggplot(mapping = aes(x = units, y = price)) +
geom_point()
# volume and price
mtp %>%
ggplot(mapping = aes(x = volume, y = price)) +
geom_point()
# volume and units
mtp %>%
ggplot(mapping = aes(x = volume, y = units)) +
geom_point()
mtp %>%
select(week, units, brand, flavor, package, volume, price, promo, ad, company) %>%
ggpairs()
# volume, price and company
mtp %>%
ggplot(mapping = aes(x = volume, y = price, color = company)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# volume, price and brand
mtp %>%
ggplot(mapping = aes(x = volume, y = price, color = brand)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# volume, price and flavor
mtp %>%
ggplot(mapping = aes(x = volume, y = price, color = flavor)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# volume, price and ad
mtp %>%
ggplot(mapping = aes(x = volume, y = price, color = ad)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
grid.arrange(
# volume and flavor
mtp %>%
ggplot(mapping = aes(x = flavor, y = volume)) +
geom_boxplot(),
# price nad flavor
mtp %>%
ggplot(mapping = aes(x = flavor, y = price)) +
geom_boxplot(),
# price nad flavor
mtp %>%
ggplot(mapping = aes(x = company, y = price)) +
geom_boxplot(),
# price nad flavor
mtp %>%
ggplot(mapping = aes(x = company, y = units)) +
geom_boxplot(),
ncol = 2
)
Units and company/ad
mtp %>%
group_by(ad, company) %>%
summarise(med_units = median(units)) %>%
ggplot(aes(ad, company)) +
geom_tile(aes(fill = med_units))
Units and company/promo
mtp %>%
group_by(promo, company) %>%
summarise(med_units = median(units)) %>%
ggplot(aes(promo, company)) +
geom_tile(aes(fill = med_units))
Do units differ by promo, ads, or company(producer)
# promo and units
mtp %>%
ggplot(mapping = aes(x = units)) +
geom_histogram() +
facet_wrap( ~ promo)
# ad and units
mtp %>%
ggplot(mapping = aes(x = units)) +
geom_histogram() +
facet_wrap( ~ ad)
# company and units
mtp %>%
ggplot(mapping = aes(x = units)) +
geom_histogram() +
facet_wrap( ~ company)
mtp %>%
ggplot(mapping = aes(x = volume, y = units)) +
geom_point() +
geom_hline(yintercept = median(mtp$units), color="blue") +
annotate(geom = "text", label = "units mean", x=3, y = 6, color="blue") +
geom_smooth(mapping = aes(color = promo), method = "lm", se = FALSE)
mtp %>%
ggplot(mapping = aes(x = volume, y = units)) +
geom_point() +
geom_hline(yintercept = median(mtp$units), color="blue") +
annotate(geom = "text", label = "units mean", x=3, y = 6, color="blue") +
geom_smooth(mapping = aes(color = ad), method = "lm", se = FALSE)
mtp %>%
ggplot(mapping = aes(x = volume, y = units)) +
geom_point() +
geom_hline(yintercept = median(mtp$units), color="blue") +
annotate(geom = "text", label = "units mean", x=3, y = 6, color="blue") +
geom_smooth(mapping = aes(color = company), method = "lm", se = FALSE)
#promo
(t <- t.test(mtp$units[mtp$promo == '1'], mtp$units[mtp$promo == '0'], conf.level = 0.95))
##
## Welch Two Sample t-test
##
## data: mtp$units[mtp$promo == "1"] and mtp$units[mtp$promo == "0"]
## t = 27.59, df = 6398.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.070437 3.540125
## sample estimates:
## mean of x mean of y
## 11.196700 7.891419
#ad
(t <- t.test(mtp$units[mtp$ad == 'B'], mtp$units[mtp$ad == 'NONE'], conf.level = 0.95))
##
## Welch Two Sample t-test
##
## data: mtp$units[mtp$ad == "B"] and mtp$units[mtp$ad == "NONE"]
## t = 12.471, df = 1145.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.526639 3.470063
## sample estimates:
## mean of x mean of y
## 11.268615 8.270263
z <- qnorm(0.975) #95 percent
mtp %>%
group_by(promo) %>%
summarise(mn = mean(units), sd = sd(units), n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = promo, y = mn)) +
geom_bar(stat = "identity", position = "dodge") +
geom_errorbar(aes(ymin = mn - ci, ymax = mn + ci), width = 0.5, position = position_dodge(0.9)) +
labs(title = "Units/sales difference by Promotion with error bar")
mtp %>%
group_by(ad) %>%
summarise(mn = mean(units), sd = sd(units), n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = ad, y = mn)) +
geom_bar(stat = "identity", position = "dodge") +
geom_errorbar(aes(ymin = mn - ci, ymax = mn + ci), width = 0.5, position = position_dodge(0.9)) +
labs(title = "Units/sales difference by advertisement type with error bar")
Do brand/company units/sales vary based on promotion?
chisq.test(table(mtp$brand, mtp$promo))
##
## Pearson's Chi-squared test
##
## data: table(mtp$brand, mtp$promo)
## X-squared = 180.16, df = 14, p-value < 2.2e-16
How do company units/sales vary with promotion
C_P_n <- mtp %>%
group_by(company, promo) %>%
summarise(n = n())
C_P_n_ci <- multinomialCI(t(C_P_n[, 3]), 0.05)
C_P_tab <- mtp %>%
group_by(company, promo) %>%
summarise(prop = round(n()/sum(nrow(mtp)), 3))
C_P_tab$ci_l <- round(C_P_n_ci[,1], 3)
C_P_tab$ci_u <- round(C_P_n_ci[,2], 3)
htmlTable(C_P_tab)
| company | promo | prop | ci_l | ci_u | |
|---|---|---|---|---|---|
| 1 | GM | 0 | 0.27 | 0.263 | 0.277 |
| 2 | GM | 1 | 0.059 | 0.052 | 0.066 |
| 3 | Kelloggs | 0 | 0.35 | 0.343 | 0.357 |
| 4 | Kelloggs | 1 | 0.102 | 0.095 | 0.109 |
| 5 | Post | 0 | 0.171 | 0.164 | 0.178 |
| 6 | Post | 1 | 0.047 | 0.04 | 0.054 |
# Graph of proportions with confidence intervals
C_P_tab %>%
ggplot(aes(x = promo, y = prop, fill = company)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -4, color = "black", # vjust moves lables above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_l, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
How do company units/sales vary with advertisement
C_A_n <- mtp %>%
group_by(company, ad) %>%
summarise(n = n())
C_A_n_ci <- multinomialCI(t(C_A_n[, 3]), 0.05)
C_A_tab <- mtp %>%
group_by(company, ad) %>%
summarise(prop = round(n()/sum(nrow(mtp)), 3))
C_A_tab$ci_l <- round(C_A_n_ci[,1], 3)
C_A_tab$ci_u <- round(C_A_n_ci[,2], 3)
htmlTable(C_A_tab)
| company | ad | prop | ci_l | ci_u | |
|---|---|---|---|---|---|
| 1 | GM | A | 0.02 | 0.013 | 0.027 |
| 2 | GM | B | 0.012 | 0.005 | 0.02 |
| 3 | GM | NONE | 0.296 | 0.289 | 0.304 |
| 4 | Kelloggs | A | 0.035 | 0.028 | 0.042 |
| 5 | Kelloggs | B | 0.025 | 0.018 | 0.032 |
| 6 | Kelloggs | NONE | 0.393 | 0.386 | 0.4 |
| 7 | Post | A | 0.012 | 0.004 | 0.019 |
| 8 | Post | B | 0.011 | 0.004 | 0.019 |
| 9 | Post | NONE | 0.196 | 0.188 | 0.203 |
# Graph of proportions with confidence intervals
C_A_tab %>%
ggplot(aes(x = ad, y = prop, fill = company)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -4, color = "black", # vjust moves lables above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_l, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
Significance of Correlation
mtp %>%
select_if(is.numeric) %>%
as.matrix() %>%
rcorr()
## week units volume price
## week 1.00 -0.03 -0.02 0.03
## units -0.03 1.00 0.02 -0.19
## volume -0.02 0.02 1.00 0.54
## price 0.03 -0.19 0.54 1.00
##
## n= 21850
##
##
## P
## week units volume price
## week 0.0000 0.0031 0.0000
## units 0.0000 0.0005 0.0000
## volume 0.0031 0.0005 0.0000
## price 0.0000 0.0000 0.0000
Multi-linear regression
# Set up mtp data set for regression
mtp_lm <- mtp %>%
mutate_if(is.integer, as.factor)
# Logit regression with general linear model (glm)
mod <- glm(units ~ company + price + volume + flavor + package + promo + ad,
family = binomial(link='logit'),
data = mtp_lm)
# Review output
summary(mod)
##
## Call:
## glm(formula = units ~ company + price + volume + flavor + package +
## promo + ad, family = binomial(link = "logit"), data = mtp_lm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0883 0.3130 0.3940 0.4767 1.6209
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.36980 0.19672 17.130 < 2e-16 ***
## companyKelloggs -0.55445 0.06866 -8.075 6.74e-16 ***
## companyPost -0.75250 0.09056 -8.309 < 2e-16 ***
## price -0.41682 0.03550 -11.740 < 2e-16 ***
## volume 1.13021 0.09726 11.621 < 2e-16 ***
## flavorCOCOA -0.69062 0.12729 -5.426 5.78e-08 ***
## flavorFRUIT -0.50820 0.13551 -3.750 0.000177 ***
## flavorREGULAR -0.08374 0.12380 -0.676 0.498761
## flavorTOASTED 0.35526 0.11885 2.989 0.002797 **
## packageCUP -0.10245 0.18216 -0.562 0.573853
## promo1 0.40156 0.07932 5.062 4.14e-07 ***
## adB 0.20834 0.18129 1.149 0.250464
## adNONE -0.19905 0.11495 -1.732 0.083329 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13608 on 21849 degrees of freedom
## Residual deviance: 12917 on 21837 degrees of freedom
## AIC: 12943
##
## Number of Fisher Scoring iterations: 5
par(mfrow = c(1, 1))
# units
plot(mtp$units, mod$residuals)
# company
plot(mtp$ad, mod$residuals)
plot(mtp$volume, mod$residuals)
# Flavor
plot(mtp$flavor, mod$residuals)
# ad
plot(mtp$ad, mod$residuals)
# promo
plot(mtp$promo, mod$residuals)
Visualization of Multiple Regression
coe <- summary(mod)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Intercept
## Waiting for profiling to be done...
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI")
# Order base on p-value
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| price | -0.417 | 0.036 | -11.74 | 0 | -0.486 | -0.347 |
| volume | 1.13 | 0.097 | 11.621 | 0 | 0.94 | 1.322 |
| companyPost | -0.752 | 0.091 | -8.309 | 0 | -0.931 | -0.576 |
| companyKelloggs | -0.554 | 0.069 | -8.075 | 0 | -0.69 | -0.421 |
| flavorCOCOA | -0.691 | 0.127 | -5.426 | 0 | -0.943 | -0.443 |
| promo1 | 0.402 | 0.079 | 5.062 | 0 | 0.248 | 0.559 |
| flavorFRUIT | -0.508 | 0.136 | -3.75 | 0 | -0.776 | -0.245 |
| flavorTOASTED | 0.355 | 0.119 | 2.989 | 0.003 | 0.119 | 0.585 |
| adNONE | -0.199 | 0.115 | -1.732 | 0.083 | -0.431 | 0.021 |
| adB | 0.208 | 0.181 | 1.149 | 0.25 | -0.143 | 0.569 |
| flavorREGULAR | -0.084 | 0.124 | -0.676 | 0.499 | -0.329 | 0.157 |
| packageCUP | -0.102 | 0.182 | -0.562 | 0.574 | -0.452 | 0.264 |
# reorder by p-value
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
geom_point(size = 3) +
xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
ylab("Variable") +
xlab("Coefficient") +
theme_bw()
)
# Use geom_segment to illustrate CI
(g2 <- g1 +
geom_vline(xintercept = 0, color = "red")) +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))),
xend = coe_CI$high_CI, color = "Blue") +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))),
xend = coe_CI$low_CI, color = "Blue") +
xlab("Coefficient with Confidence Interval")
Graph for promo finding:
z <- qnorm(0.975) #95 percent
levels(mtp$ad)[1] <- "Mediam Advertisement"
levels(mtp$ad)[2] <- "Small Advertisement"
levels(mtp$ad)[3] <- "No Advertisement"
levels(mtp$promo)[1] <- "Without Promotion"
levels(mtp$promo)[2] <- "With Promotion"
mtp %>%
group_by(promo) %>%
summarise(mn = mean(units), sd = sd(units), n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = reorder(promo, mn) , y = mn)) +
theme_classic() +
geom_bar(stat = "identity", position = "dodge" ,fill = "light blue") +
#geom_errorbar(aes(ymin = mn - ci, ymax = mn + ci), width = 0.5, position = position_dodge(0.9)) +
labs(title = "Promotion has contributed more sales", subtitle = "Cereals with promotion have approximately 42% more sales",x = "", y ="Average weekly units sold", caption = "From Technical Appendix: Statistic EDA") +
scale_y_continuous(breaks = seq(0, 16, 1)) +
coord_flip() +
ggsave(filename = "promo.png")
mtp %>%
group_by(ad) %>%
summarise(mn = mean(units), sd = sd(units), n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = reorder(ad, mn) , y = mn)) +
theme_classic() +
geom_bar(stat = "identity", position = "dodge", fill = "light blue") +
#geom_errorbar(aes(ymin = mn - ci, ymax = mn + ci), width = 0.5, position = position_dodge(0.9)) +
labs(title = "Advertisement has an impact on our sales", subtitle = "Small advertisement have more impact on our sales than mediam advertisement", x = "", y = "Average weekly units sold", caption = "From Technical Appendix: Statistic EDA") +
scale_y_continuous(breaks = seq(0, 16, 1)) +
coord_flip() +
ggsave(filename = "ad.png")
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
geom_point(size = 3) +
xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
ylab("Variable") +
xlab("Coefficient") +
theme_bw() +
geom_vline(xintercept = 0, color = "red") +
labs(title = "What customers makes customer buy more and buy less", subtitle = "Volume, Promotion, Flavor and Small ad have positive influence on sales", caption = "From Technical Appendix: Statistic EDA", x ="Unit Sale Influence level", y ="") +
annotate(geom = "text", label = "Improves Unit sales", x=1, y = 1, color="dark blue") +
annotate(geom = "text", label = "Decrease Unit sales", x=-0.7, y = 1, color="dark red") +
theme_classic() +
ggsave(filename = "coefficient.png")
)
## Saving 7 x 5 in image